library(ngsReports)
library(magrittr)
library(scales)
library(pander)
library(tidyverse)
if (interactive()) setwd(here::here("R"))
rawFqc <- list.files("../0_rawData/FastQC/", pattern = "zip", full.names = TRUE) %>%
FastqcDataList()
trimFqc <- list.files("../1_trimmed/FastQC/", pattern = "zip", full.names = TRUE) %>%
FastqcDataList()
deMuxFqc <- list.files("../2_demux/FastQC/", pattern = "zip", full.names = TRUE) %>%
FastqcDataList()
origFqc <- list.files("../previousAnalysis/FastQC/", pattern = "fastqc.zip", full.names = TRUE) %>%
FastqcDataList()
samples <- read_tsv("../0_rawData/samples.tsv")
Each of the libraries was inspected for overall quality. Positions 3 & 4 from R2 libraries FCC21WPACXX-CHKPEI13070002_L6 and FCC21WPACXX-CHKPEI13070003_L7 showed clear problems with read qualities. This was likely due to an unsatisfactory nucleotide diversity in the original sequencing run and not enough phiX to overcome this, particularly as all R2 reads will begin with the restriction site (i.e. fragments will terminate with this as there is no R2 barcode).
plotBaseQuals(rawFqc, plotType = "boxplot")
Base qualities before trimming
plotBaseQuals(trimFqc, plotType = "boxplot")
Base qualities after trimming
list(
readTotals(rawFqc) %>% mutate(Type = "Raw"),
readTotals(trimFqc) %>% mutate(Type = "Trimmed")
) %>%
bind_rows() %>%
spread(key = "Type", value = "Total_Sequences") %>%
mutate(Library = str_remove(Filename, "_[12].fq.gz")) %>%
distinct(Library, .keep_all = TRUE) %>%
dplyr::select(Library, Raw, Trimmed) %>%
mutate(
Retained = percent(Trimmed / Raw),
Discarded = percent((Raw - Trimmed) / Raw)
) %>%
arrange(Discarded) %>%
bind_rows(
tibble(
Library = "Total",
Raw = sum(.$Raw),
Trimmed = sum(.$Trimmed)
) %>%
mutate(
Retained = percent(Trimmed / Raw),
Discarded = percent((Raw - Trimmed) / Raw)
)
) %>%
pander(
big.mark = ",",
justify = "lrrrr",
style = "rmarkdown",
split.tables = Inf,
emphasize.strong.rows = 8,
caption = "*Results from adapter removal. Reads < 70bp after trimming were discarded during this process*"
)
| Library | Raw | Trimmed | Retained | Discarded |
|---|---|---|---|---|
| FCC229TACXX-CHKPEI13070001_L3 | 56,981,567 | 55,762,854 | 97.861% | 2.139% |
| FCC2GPDACXX-CHKPEI13070007_L6 | 66,601,007 | 64,940,945 | 97.507% | 2.493% |
| FCC2GPDACXX-CHKPEI13070004_L2 | 78,411,565 | 76,201,274 | 97.181% | 2.819% |
| FCC2GPDACXX-CHKPEI13070006_L4 | 70,874,412 | 68,811,191 | 97.089% | 2.911% |
| FCC21WPACXX-CHKPEI13070003_L7 | 65,481,942 | 63,542,676 | 97.038% | 2.962% |
| FCC2GPDACXX-CHKPEI13070005_L3 | 78,445,394 | 76,021,945 | 96.911% | 3.089% |
| FCC21WPACXX-CHKPEI13070002_L6 | 69,729,067 | 67,032,866 | 96.133% | 3.867% |
| Total | 486,524,954 | 472,313,751 | 97% | 3% |
The first check after demultiplexing is to ensure that read were not assigned to multiple individuals by sabre pe, given that one mismatch was permitted per barcode/restriction-site. Read Totals before and after demultiplexing were then checked and the recovery rate was >93% for each library, with the clear exception of FCC21WPACXX-CHKPEI13070002_L6. Manual inspection revealed that a significant majority (~9.3x106 of 13.4x106) of these non-recovered reads contained truncated barcodes with the first two bases missing. If an additional recovery step was taken this would increase the recovery rate to 93% for this library, and may yield an additional 5-600,000 reads per sample. For the next most poorly recovered library, an additional step may recover an additional 3x106 reads. However, no further action was taken at this point.
trimReadTotals <- readTotals(trimFqc) %>%
mutate(Library = str_remove_all(Filename, "_[12].fq.gz")) %>%
distinct(Library, Total_Sequences)
deMuxReadTotals <- readTotals(deMuxFqc) %>%
mutate(ID = str_remove_all(Filename, ".[12].fq.gz")) %>%
distinct(ID, Total_Sequences) %>%
left_join(samples, by = "ID") %>%
group_by(Library) %>%
summarise(DeMultiplexed = sum(Total_Sequences)) %>%
filter(!is.na(Library))
trimReadTotals %>%
left_join(deMuxReadTotals) %>%
mutate(`Recovery Rate` = DeMultiplexed / Total_Sequences) %>%
dplyr::select(Library, everything()) %>%
dplyr::rename(`Total Sequences` = Total_Sequences) %>%
mutate(`Recovery Rate` = percent(`Recovery Rate`)) %>%
arrange(`Recovery Rate`) %>%
bind_rows(
tibble(
Library = "Total",
`Total Sequences` = sum(.$`Total Sequences`),
DeMultiplexed = sum(.$DeMultiplexed)
) %>%
mutate(
`Recovery Rate` = percent(DeMultiplexed / `Total Sequences`)
)
) %>%
pander(
split.tables = Inf,
big.mark = ",",
justify = "lrrr",
style = "rmarkdown",
emphasize.strong.rows = 8,
caption = "*Recovery rate from demultiplexing the 1996 and 2012 samples, after adapter removal*"
)
| Library | Total Sequences | DeMultiplexed | Recovery Rate |
|---|---|---|---|
| FCC21WPACXX-CHKPEI13070002_L6 | 67,032,866 | 53,568,147 | 79.913% |
| FCC2GPDACXX-CHKPEI13070005_L3 | 76,021,945 | 70,702,537 | 93.003% |
| FCC2GPDACXX-CHKPEI13070006_L4 | 68,811,191 | 66,677,690 | 96.899% |
| FCC229TACXX-CHKPEI13070001_L3 | 55,762,854 | 55,037,971 | 98.700% |
| FCC21WPACXX-CHKPEI13070003_L7 | 63,542,676 | 62,787,380 | 98.811% |
| FCC2GPDACXX-CHKPEI13070004_L2 | 76,201,274 | 75,316,445 | 98.839% |
| FCC2GPDACXX-CHKPEI13070007_L6 | 64,940,945 | 64,349,296 | 99.089% |
| Total | 472,313,751 | 448,439,466 | 95% |
cp <- paste("*Read Totals for each of the 1996 & 2012 samples.",
"The black line indicates the mean library size,",
"whilst the dashed green line indicates the mean library size based",
"on the original library before demultiplexing.",
"Bar colours indicate sample population as 1996 (blue) or 2012 (red).*")
plotly::ggplotly(
deMuxFqc %>%
magrittr::extract(grepl("(ora|gc).+1.fq.gz", fqName(.))) %>%
readTotals() %>%
mutate(ID = str_remove(Filename, ".1.fq.gz")) %>%
left_join(samples) %>%
mutate(Population = case_when(
grepl("gc", ID) ~ "1996",
grepl("ora", ID) ~ "2012"
)) %>%
ggplot(aes(ID, Total_Sequences, fill = Population)) +
geom_bar(stat = "identity") +
geom_hline(
aes(yintercept = mn),
data = . %>% summarise(mn = mean(Total_Sequences))
) +
geom_hline(
aes(yintercept = mn),
data = . %>% group_by(Library) %>% summarise(mn = mean(Total_Sequences)),
colour = "green",
linetype = 2
) +
facet_wrap(~Library, scales = "free_x", nrow = 2) +
scale_y_continuous(labels = comma) +
scale_fill_manual(values = c(rgb(0.1, 0.1, 0.7), rgb(0.7, 0.1, 0.1))) +
labs(x = c(), y = c()) +
theme_bw() +
theme(
axis.text.x = element_text(angle = 90),
legend.position = "none"
)
)
Read Totals for each of the 1996 & 2012 samples. The black line indicates the mean library size, whilst the dashed green line indicates the mean library size based on the original library before demultiplexing. Bar colours indicate sample population as 1996 (blue) or 2012 (red).
In summary, of the 486,524,954 initial reads, a total of 448,439,466 were retained after trimming and demultiplexing. This represents a combined recovery rate of 92% from the initial libraries.
The revised strategy showed considerable improvement in yield for the libraries FCC21WPACXX-CHKPEI13070002_L6 and FCC21WPACXX-CHKPEI13070003_L7 which both contained exculsively 1996 samples. As such the revised strategy was pursued further.
list(
origFqc %>%
readTotals() %>%
mutate(ID = str_remove(Filename, ".Lib..[12].fq"),
Type = "Previous"),
deMuxFqc %>%
readTotals() %>%
mutate(ID = str_remove(Filename, ".[12].fq.gz"),
Type = "Repeat")
) %>%
bind_rows() %>%
distinct(ID, Type, Total_Sequences) %>%
dplyr::select(ID, Type, Total_Sequences) %>%
filter(grepl("(gc|ora)", ID)) %>%
mutate(Population = case_when(
grepl("gc", ID) ~ "1996",
grepl("ora", ID) ~ "2012"
)) %>%
left_join(samples) %>%
ggplot(aes(Library, Total_Sequences, fill = Type)) +
geom_boxplot() +
labs(y = "Total Reads") +
scale_y_continuous(labels = comma) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
Comparison of library sizes after demultiplexing using sabre (blue), vs the original method using stacks (salmon).
list(
origFqc %>%
readTotals() %>%
mutate(ID = str_remove(Filename, ".Lib..[12].fq"),
Type = "Previous"),
deMuxFqc %>%
readTotals() %>%
mutate(ID = str_remove(Filename, ".[12].fq.gz"),
Type = "Repeat")
) %>%
bind_rows() %>%
distinct(ID, Type, Total_Sequences) %>%
dplyr::select(ID, Type, Total_Sequences) %>%
filter(grepl("(gc|ora)", ID)) %>%
spread("Type", "Total_Sequences") %>%
mutate(Change = Repeat / Previous,
Population = case_when(
grepl("gc", ID) ~ "1996",
grepl("ora", ID) ~ "2012"
)) %>%
left_join(samples) %>%
ggplot(aes(ID, Change, fill = Population)) +
geom_bar(stat = "identity") +
geom_hline(yintercept = 1, linetype = 2) +
facet_wrap(~Library, scales = "free_x", nrow = 2) +
scale_y_continuous(labels = percent) +
scale_fill_manual(values = c(rgb(0.1, 0.1, 0.7, 0.8), rgb(0.7, 0.1, 0.1, 0.8))) +
labs(x = c(), y = "% Improvement") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90),
legend.position = "none")
Changes in library size using the improved recovery strategy. All samples showed an improved rate of recovery.
list(
origFqc %>%
magrittr::extract(!grepl("(ora|gc)", fqName(.))) %>%
readTotals(),
deMuxFqc %>%
magrittr::extract(grepl("1.fq", fqName(.))) %>%
readTotals()
) %>%
bind_rows() %>%
mutate(Population = case_when(
grepl("ora", Filename) ~ 2012,
grepl("gc", Filename) ~ 1996,
!grepl("(ora|gc)", Filename) ~ 2010
),
Population = as.factor(Population)) %>%
mutate(ID = str_remove(Filename, ".[12].fq.gz")) %>%
ggplot(aes(Population, Total_Sequences, fill = Population)) +
geom_boxplot() +
scale_y_continuous(labels = comma) +
theme_bw()
Comparison of library sizes for the three populations.
deMuxFqc %>%
magrittr::extract(grepl("(ora|gc)", fqName(.))) %>%
plotGcContent(plotType = "line", usePlotly = TRUE, theoreticalGC = FALSE)
GC content for all 1996 and 2012 samples.
Inspection by GC content showed that gc2709 and gc2700 appeared to have an exaggerated peak around 60%, whilst all other samples showed a more broad spread across the range. From the Turretfield samples (previous analysis), pt1125 showed an unexpected peak around 50% indicating that sample may contain reads from a different species. This sample should be excluded from all further analysis.
origFqc %>%
magrittr::extract(!grepl("(ora|gc)", fqName(.))) %>%
plotGcContent(plotType = "line", usePlotly = TRUE, theoreticalGC = FALSE)
GC content for all Turretfield samples.
deMuxFqc %>%
plotSeqLengthDistn(usePlotly = TRUE, dendrogram = TRUE)
Length Distribution for all files
plotSeqContent(deMuxFqc, usePlotly = TRUE, dendrogram = TRUE, cluster = TRUE)
Sequence content showing the presence of the RS in both the Turretfield and R2 samples.
pander(sessionInfo())
R version 3.6.2 (2019-12-12)
Platform: x86_64-pc-linux-gnu (64-bit)
locale: LC_CTYPE=en_AU.UTF-8, LC_NUMERIC=C, LC_TIME=en_AU.UTF-8, LC_COLLATE=en_AU.UTF-8, LC_MONETARY=en_AU.UTF-8, LC_MESSAGES=en_AU.UTF-8, LC_PAPER=en_AU.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_AU.UTF-8 and LC_IDENTIFICATION=C
attached base packages: parallel, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: forcats(v.0.4.0), stringr(v.1.4.0), dplyr(v.0.8.3), purrr(v.0.3.3), readr(v.1.3.1), tidyr(v.1.0.0), tidyverse(v.1.3.0), pander(v.0.6.3), scales(v.1.1.0), magrittr(v.1.5), ngsReports(v.1.1.2), tibble(v.2.1.3), ggplot2(v.3.2.1) and BiocGenerics(v.0.32.0)
loaded via a namespace (and not attached): colorspace(v.1.4-1), hwriter(v.1.3.2), ellipsis(v.0.3.0), XVector(v.0.26.0), GenomicRanges(v.1.38.0), ggdendro(v.0.1-20), fs(v.1.3.1), rstudioapi(v.0.10), farver(v.2.0.1), ggrepel(v.0.8.1), lubridate(v.1.7.4), xml2(v.1.2.2), codetools(v.0.2-16), leaps(v.3.0), knitr(v.1.26), zeallot(v.0.1.0), jsonlite(v.1.6), Rsamtools(v.2.2.1), Cairo(v.1.5-10), broom(v.0.5.2), cluster(v.2.1.0), dbplyr(v.1.4.2), shiny(v.1.4.0), compiler(v.3.6.2), httr(v.1.4.1), backports(v.1.1.5), fastmap(v.1.0.1), assertthat(v.0.2.1), Matrix(v.1.2-18), lazyeval(v.0.2.2), cli(v.1.1.0), later(v.1.0.0), htmltools(v.0.4.0), tools(v.3.6.2), gtable(v.0.3.0), glue(v.1.3.1), GenomeInfoDbData(v.1.2.2), reshape2(v.1.4.3), FactoMineR(v.2.0), ShortRead(v.1.44.0), Rcpp(v.1.0.3), Biobase(v.2.46.0), cellranger(v.1.1.0), vctrs(v.0.2.0), Biostrings(v.2.54.0), nlme(v.3.1-143), crosstalk(v.1.0.0), xfun(v.0.11), rvest(v.0.3.5), mime(v.0.7), lifecycle(v.0.1.0), zlibbioc(v.1.32.0), MASS(v.7.3-51.4), zoo(v.1.8-6), promises(v.1.1.0), hms(v.0.5.2), SummarizedExperiment(v.1.16.0), RColorBrewer(v.1.1-2), yaml(v.2.2.0), latticeExtra(v.0.6-28), stringi(v.1.4.3), highr(v.0.8), S4Vectors(v.0.24.0), BiocParallel(v.1.20.0), truncnorm(v.1.0-8), GenomeInfoDb(v.1.22.0), rlang(v.0.4.2), pkgconfig(v.2.0.3), matrixStats(v.0.55.0), bitops(v.1.0-6), evaluate(v.0.14), lattice(v.0.20-38), GenomicAlignments(v.1.22.1), htmlwidgets(v.1.5.1), labeling(v.0.3), tidyselect(v.0.2.5), plyr(v.1.8.4), R6(v.2.4.1), IRanges(v.2.20.1), generics(v.0.0.2), DelayedArray(v.0.12.0), DBI(v.1.0.0), pillar(v.1.4.2), haven(v.2.2.0), withr(v.2.1.2), scatterplot3d(v.0.3-41), RCurl(v.1.95-4.12), modelr(v.0.1.5), crayon(v.1.3.4), plotly(v.4.9.1), rmarkdown(v.1.18), grid(v.3.6.2), readxl(v.1.3.1), data.table(v.1.12.6), reprex(v.0.3.0), digest(v.0.6.23), flashClust(v.1.01-2), webshot(v.0.5.2), xtable(v.1.8-4), httpuv(v.1.5.2), stats4(v.3.6.2), munsell(v.0.5.0), viridisLite(v.0.3.0) and kableExtra(v.1.1.0)